{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ec889d86-0d16-477f-8b7f-be03d73ad957",
   "metadata": {},
   "source": [
    "# Lab 1 - Overview of embeddings-based retrieval"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58ee2f53-d88b-4f00-94a2-75a66d4149e9",
   "metadata": {},
   "source": [
    "Welcome! Here's a few notes about the Chroma course notebooks.\n",
    " - A number of warnings pop up when running the notebooks. These are normal and can be ignored.\n",
    " - Some operations such as calling an LLM or an opeation using generated data return unpredictable results and so your notebook outputs may differ from the video.\n",
    "  \n",
    "Enjoy the course!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a5536f0-651c-40e7-aa15-27ee0cda80b7",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from helper_utils import word_wrap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3748b16d-d4a7-49c3-a48a-57dcfc42acd6",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "from pypdf import PdfReader\n",
    "\n",
    "reader = PdfReader(\"microsoft_annual_report_2022.pdf\")\n",
    "pdf_texts = [p.extract_text().strip() for p in reader.pages]\n",
    "\n",
    "# Filter the empty strings\n",
    "pdf_texts = [text for text in pdf_texts if text]\n",
    "\n",
    "print(word_wrap(pdf_texts[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bbd608a5-b6c9-4ae9-a871-a3e470a4d12a",
   "metadata": {},
   "source": [
    "You can view the pdf in your browser [here](./microsoft_annual_report_2022.pdf) if you would like. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a338ec83-6301-41a5-9ab1-e5d583306a3f",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "888a86f8-2fe2-4682-bdaf-c15129ed1a32",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "character_splitter = RecursiveCharacterTextSplitter(\n",
    "    separators=[\"\\n\\n\", \"\\n\", \". \", \" \", \"\"],\n",
    "    chunk_size=1000,\n",
    "    chunk_overlap=0\n",
    ")\n",
    "character_split_texts = character_splitter.split_text('\\n\\n'.join(pdf_texts))\n",
    "\n",
    "print(word_wrap(character_split_texts[10]))\n",
    "print(f\"\\nTotal chunks: {len(character_split_texts)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5665c695-22ea-4264-b1ac-5ba720b6d78b",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)\n",
    "\n",
    "token_split_texts = []\n",
    "for text in character_split_texts:\n",
    "    token_split_texts += token_splitter.split_text(text)\n",
    "\n",
    "print(word_wrap(token_split_texts[10]))\n",
    "print(f\"\\nTotal chunks: {len(token_split_texts)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2a13d14-4484-46f0-8e67-277337f9d138",
   "metadata": {
    "height": 114
   },
   "outputs": [],
   "source": [
    "import chromadb\n",
    "from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction\n",
    "\n",
    "embedding_function = SentenceTransformerEmbeddingFunction()\n",
    "print(embedding_function([token_split_texts[10]]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ba6c8c5-9ce4-44d0-9223-6fdd77871f87",
   "metadata": {
    "height": 148
   },
   "outputs": [],
   "source": [
    "chroma_client = chromadb.Client()\n",
    "chroma_collection = chroma_client.create_collection(\"microsoft_annual_report_2022\", embedding_function=embedding_function)\n",
    "\n",
    "ids = [str(i) for i in range(len(token_split_texts))]\n",
    "\n",
    "chroma_collection.add(ids=ids, documents=token_split_texts)\n",
    "chroma_collection.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfdb54db-a442-423c-b006-c33a257cd7d7",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "query = \"What was the total revenue?\"\n",
    "\n",
    "results = chroma_collection.query(query_texts=[query], n_results=5)\n",
    "retrieved_documents = results['documents'][0]\n",
    "\n",
    "for document in retrieved_documents:\n",
    "    print(word_wrap(document))\n",
    "    print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "377a84aa-1d93-4e97-9b2d-d59c46355338",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import openai\n",
    "from openai import OpenAI\n",
    "\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "_ = load_dotenv(find_dotenv()) # read local .env file\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']\n",
    "\n",
    "openai_client = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba0ed8ca-6640-4c09-9cb3-9de5e7cf46dc",
   "metadata": {
    "height": 335
   },
   "outputs": [],
   "source": [
    "def rag(query, retrieved_documents, model=\"gpt-3.5-turbo\"):\n",
    "    information = \"\\n\\n\".join(retrieved_documents)\n",
    "\n",
    "    messages = [\n",
    "        {\n",
    "            \"role\": \"system\",\n",
    "            \"content\": \"You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report.\"\n",
    "            \"You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information.\"\n",
    "        },\n",
    "        {\"role\": \"user\", \"content\": f\"Question: {query}. \\n Information: {information}\"}\n",
    "    ]\n",
    "    \n",
    "    response = openai_client.chat.completions.create(\n",
    "        model=model,\n",
    "        messages=messages,\n",
    "    )\n",
    "    content = response.choices[0].message.content\n",
    "    return content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28bac3a2-0d29-48dc-9b48-2d9313239a25",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "output = rag(query=query, retrieved_documents=retrieved_documents)\n",
    "\n",
    "print(word_wrap(output))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db9f2758-0f5a-49e5-b1fa-517b91324575",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aee59493-8a99-4da8-b94f-4747efcfc79d",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5eda9bc-ae76-4db6-9e0c-ae099d852d78",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1183e75-4c65-422e-bc47-48010d8b29c9",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fcd85cc-8898-41ed-a0aa-bd8a33fc565a",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c65337e9-85ee-47f7-89fd-7fe77cd0e1b2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7794092-4195-4cf3-9eab-11c9c05a26b9",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2cab7a1-1be7-45f0-83b7-543e48f83901",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe0343be-73c9-4aed-83b0-aba09569ac87",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f0f3e33-e517-4f6b-8b38-c47c1e3d40b4",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a16fdcb1-57d0-4f04-af8f-7c7fc594d947",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "babe7893-9cbc-43c5-94ef-cbf8f5d68cf2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60a9524b-1085-4bdf-a161-39f11397dc1f",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d189f088-b58e-4583-9590-afdfa624cf87",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b26a01a-4575-446b-b8dc-a8c5ab153172",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0950575b-b69d-46a3-8c91-c7af89f5c204",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f123ad8-b2e8-4a25-8b42-a520ecaf566b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83c04587-d1de-419c-a213-2e3eb67dc33d",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3155972-824e-4ebe-a692-2227c113c5a8",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8144a4a-85f6-4800-87f9-36a1b6ceda1f",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ff0b18e-12a0-4ac0-97dd-8618b22e7dbf",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03ca7e7c-4b47-4652-9b46-a40b3dffa5e6",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f74e7d67-7f51-41c4-8e25-edbaa02d0bd8",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9188e886-d406-406f-b234-f5c3353a77a2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d3bb286-2694-4ed4-8466-46865e997ced",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2876084b-4038-4b0c-8ec8-8294a86adfc1",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ac542e1-b094-431f-9611-cf7e36d3f0de",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcd6114b-c09d-4173-a623-9a08aaf63e4b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad10ab65-b351-4f4b-b7d2-63474acfb9f9",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "800f3d81-cbdb-4ba4-8d49-85747fdfded8",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37847448-c9f6-4f51-bf06-f7809964a8b2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dcefc87-0964-4b94-946b-2145781ad606",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fc994bc-7b1e-476a-9df9-300a3e374882",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ef5f5d5-acb7-4b0a-93ef-e61306708e69",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44e4b33f-d8fb-4f3a-b884-8b43a3766583",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2a480a2-2c29-4a01-80dd-ee41934b7901",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8127c2bf-0d15-4b62-b46a-f7a17ad2ec92",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18ded129-a637-4269-a116-550fe9a90570",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1d7ee44-7b29-483f-a3f2-cc9d8e18880e",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e450dd8-9719-42c6-8c3c-33cac910e0a5",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
